Importing libraries¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import stats
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
import plotly as po
import plotly.express as px
import scipy.cluster.hierarchy as sch
import scipy.stats as stats
po.offline.init_notebook_mode(connected=True)
Loading the dataset¶
data = pd.read_csv("./CardiacPrediction.csv")
data.head()
| SEQN | Gender | Age | Annual-Family-Income | Ratio-Family-Income-Poverty | X60-sec-pulse | Systolic | Diastolic | Weight | Height | ... | Total-Cholesterol | HDL | Glycohemoglobin | Vigorous-work | Moderate-work | Health-Insurance | Diabetes | Blood-Rel-Diabetes | Blood-Rel-Stroke | CoronaryHeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 1 | 77 | 8 | 5.00 | 68 | 98 | 56 | 75.4 | 174.0 | ... | 5.56 | 1.39 | 4.7 | 3 | 3 | 1 | 2 | 2 | 2 | 0 |
| 1 | 5 | 1 | 49 | 11 | 5.00 | 66 | 122 | 83 | 92.5 | 178.3 | ... | 7.21 | 1.08 | 5.5 | 1 | 1 | 1 | 2 | 2 | 2 | 0 |
| 2 | 12 | 1 | 37 | 11 | 4.93 | 64 | 174 | 99 | 99.2 | 180.0 | ... | 4.03 | 0.98 | 5.2 | 2 | 1 | 1 | 2 | 1 | 1 | 0 |
| 3 | 13 | 1 | 70 | 3 | 1.07 | 102 | 130 | 66 | 63.6 | 157.7 | ... | 8.12 | 1.28 | 7.6 | 3 | 3 | 1 | 1 | 1 | 2 | 0 |
| 4 | 14 | 1 | 81 | 5 | 2.67 | 72 | 136 | 61 | 75.5 | 166.2 | ... | 4.50 | 1.04 | 5.8 | 1 | 1 | 1 | 2 | 2 | 2 | 0 |
5 rows × 51 columns
EDA¶
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 37079 entries, 0 to 37078 Data columns (total 51 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SEQN 37079 non-null int64 1 Gender 37079 non-null int64 2 Age 37079 non-null int64 3 Annual-Family-Income 37079 non-null int64 4 Ratio-Family-Income-Poverty 37079 non-null float64 5 X60-sec-pulse 37079 non-null int64 6 Systolic 37079 non-null int64 7 Diastolic 37079 non-null int64 8 Weight 37079 non-null float64 9 Height 37079 non-null float64 10 Body-Mass-Index 37079 non-null float64 11 White-Blood-Cells 37079 non-null float64 12 Lymphocyte 37079 non-null float64 13 Monocyte 37079 non-null float64 14 Eosinophils 37079 non-null float64 15 Basophils 37079 non-null float64 16 Red-Blood-Cells 37079 non-null float64 17 Hemoglobin 37079 non-null float64 18 Mean-Cell-Vol 37079 non-null float64 19 Mean-Cell-Hgb-Conc. 37079 non-null float64 20 Mean-cell-Hemoglobin 37079 non-null float64 21 Platelet-count 37079 non-null float64 22 Mean-Platelet-Vol 37079 non-null float64 23 Segmented-Neutrophils 37079 non-null float64 24 Hematocrit 37079 non-null float64 25 Red-Cell-Distribution-Width 37079 non-null float64 26 Albumin 37079 non-null int64 27 ALP 37079 non-null int64 28 AST 37079 non-null int64 29 ALT 37079 non-null int64 30 Cholesterol 37079 non-null float64 31 Creatinine 37079 non-null float64 32 Glucose 37079 non-null float64 33 GGT 37079 non-null int64 34 Iron 37079 non-null float64 35 LDH 37079 non-null int64 36 Phosphorus 37079 non-null float64 37 Bilirubin 37079 non-null float64 38 Protein 37079 non-null float64 39 Uric.Acid 37079 non-null float64 40 Triglycerides 37079 non-null float64 41 Total-Cholesterol 37079 non-null float64 42 HDL 37079 non-null float64 43 Glycohemoglobin 37079 non-null float64 44 Vigorous-work 37079 non-null int64 45 Moderate-work 37079 non-null int64 46 Health-Insurance 37079 non-null int64 47 Diabetes 37079 non-null int64 48 Blood-Rel-Diabetes 37079 non-null int64 49 Blood-Rel-Stroke 37079 non-null int64 50 CoronaryHeartDisease 37079 non-null int64 dtypes: float64(31), int64(20) memory usage: 14.4 MB
data.columns
Index(['SEQN', 'Gender', 'Age', 'Annual-Family-Income',
'Ratio-Family-Income-Poverty', 'X60-sec-pulse', 'Systolic', 'Diastolic',
'Weight', 'Height', 'Body-Mass-Index', 'White-Blood-Cells',
'Lymphocyte', 'Monocyte', 'Eosinophils', 'Basophils', 'Red-Blood-Cells',
'Hemoglobin', 'Mean-Cell-Vol', 'Mean-Cell-Hgb-Conc.',
'Mean-cell-Hemoglobin', 'Platelet-count', 'Mean-Platelet-Vol',
'Segmented-Neutrophils', 'Hematocrit', 'Red-Cell-Distribution-Width',
'Albumin', 'ALP', 'AST', 'ALT', 'Cholesterol', 'Creatinine', 'Glucose',
'GGT', 'Iron', 'LDH', 'Phosphorus', 'Bilirubin', 'Protein', 'Uric.Acid',
'Triglycerides', 'Total-Cholesterol', 'HDL', 'Glycohemoglobin',
'Vigorous-work', 'Moderate-work', 'Health-Insurance', 'Diabetes',
'Blood-Rel-Diabetes', 'Blood-Rel-Stroke', 'CoronaryHeartDisease'],
dtype='object')
data.describe()
| SEQN | Gender | Age | Annual-Family-Income | Ratio-Family-Income-Poverty | X60-sec-pulse | Systolic | Diastolic | Weight | Height | ... | Total-Cholesterol | HDL | Glycohemoglobin | Vigorous-work | Moderate-work | Health-Insurance | Diabetes | Blood-Rel-Diabetes | Blood-Rel-Stroke | CoronaryHeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | ... | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 | 37079.000000 |
| mean | 48901.041236 | 1.513282 | 48.943661 | 7.358208 | 2.559026 | 72.579250 | 124.090078 | 69.919253 | 80.988276 | 167.389601 | ... | 5.081713 | 1.370344 | 5.676496 | 1.783840 | 1.598856 | 1.218587 | 1.907333 | 1.549502 | 1.796165 | 0.040670 |
| std | 26753.636441 | 0.499830 | 18.010440 | 3.994083 | 1.624789 | 12.242108 | 19.254741 | 13.575804 | 20.678734 | 10.122908 | ... | 1.072682 | 0.415985 | 1.050223 | 0.448324 | 0.511199 | 0.461102 | 0.349674 | 0.497550 | 0.402853 | 0.197527 |
| min | 2.000000 | 1.000000 | 20.000000 | 1.000000 | 0.000000 | 32.000000 | 0.000000 | 0.000000 | 32.300000 | 129.700000 | ... | 1.530000 | 0.160000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 26120.500000 | 1.000000 | 33.000000 | 4.000000 | 1.140000 | 64.000000 | 111.000000 | 62.000000 | 66.500000 | 160.000000 | ... | 4.320000 | 1.070000 | 5.200000 | 2.000000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 | 2.000000 | 0.000000 |
| 50% | 50065.000000 | 2.000000 | 48.000000 | 7.000000 | 2.180000 | 72.000000 | 121.000000 | 70.000000 | 78.200000 | 167.100000 | ... | 5.020000 | 1.290000 | 5.400000 | 2.000000 | 2.000000 | 1.000000 | 2.000000 | 2.000000 | 2.000000 | 0.000000 |
| 75% | 71173.500000 | 2.000000 | 63.000000 | 10.000000 | 4.130000 | 80.000000 | 134.000000 | 78.000000 | 92.100000 | 174.600000 | ... | 5.740000 | 1.600000 | 5.800000 | 2.000000 | 2.000000 | 1.000000 | 2.000000 | 2.000000 | 2.000000 | 0.000000 |
| max | 93702.000000 | 2.000000 | 85.000000 | 15.000000 | 5.000000 | 224.000000 | 270.000000 | 132.000000 | 371.000000 | 204.500000 | ... | 14.090000 | 5.840000 | 18.800000 | 3.000000 | 3.000000 | 9.000000 | 3.000000 | 2.000000 | 2.000000 | 1.000000 |
8 rows × 51 columns
#null_summary = pd.DataFrame({'Null Count': data.isnull().sum(), 'Null Percentage': (data.isnull().sum() / len(data)) * 100})
# Calculate null counts in each column
null_counts = data.isnull().sum()
# Calculate null percentage in each column
total_rows = len(data)
null_percentage = (null_counts / total_rows) * 100
# Print null counts and percentages as a table
print("Column Name | Null Count | Null Percentage")
print("-" * 60) # Separator line
for column in data.columns:
print(f"{column:<30} | {null_counts[column]:<11} | {null_percentage[column]:.2f}%")
Column Name | Null Count | Null Percentage ------------------------------------------------------------ SEQN | 0 | 0.00% Gender | 0 | 0.00% Age | 0 | 0.00% Annual-Family-Income | 0 | 0.00% Ratio-Family-Income-Poverty | 0 | 0.00% X60-sec-pulse | 0 | 0.00% Systolic | 0 | 0.00% Diastolic | 0 | 0.00% Weight | 0 | 0.00% Height | 0 | 0.00% Body-Mass-Index | 0 | 0.00% White-Blood-Cells | 0 | 0.00% Lymphocyte | 0 | 0.00% Monocyte | 0 | 0.00% Eosinophils | 0 | 0.00% Basophils | 0 | 0.00% Red-Blood-Cells | 0 | 0.00% Hemoglobin | 0 | 0.00% Mean-Cell-Vol | 0 | 0.00% Mean-Cell-Hgb-Conc. | 0 | 0.00% Mean-cell-Hemoglobin | 0 | 0.00% Platelet-count | 0 | 0.00% Mean-Platelet-Vol | 0 | 0.00% Segmented-Neutrophils | 0 | 0.00% Hematocrit | 0 | 0.00% Red-Cell-Distribution-Width | 0 | 0.00% Albumin | 0 | 0.00% ALP | 0 | 0.00% AST | 0 | 0.00% ALT | 0 | 0.00% Cholesterol | 0 | 0.00% Creatinine | 0 | 0.00% Glucose | 0 | 0.00% GGT | 0 | 0.00% Iron | 0 | 0.00% LDH | 0 | 0.00% Phosphorus | 0 | 0.00% Bilirubin | 0 | 0.00% Protein | 0 | 0.00% Uric.Acid | 0 | 0.00% Triglycerides | 0 | 0.00% Total-Cholesterol | 0 | 0.00% HDL | 0 | 0.00% Glycohemoglobin | 0 | 0.00% Vigorous-work | 0 | 0.00% Moderate-work | 0 | 0.00% Health-Insurance | 0 | 0.00% Diabetes | 0 | 0.00% Blood-Rel-Diabetes | 0 | 0.00% Blood-Rel-Stroke | 0 | 0.00% CoronaryHeartDisease | 0 | 0.00%
# Calculate the number of distinct values in each column
distinct_counts = data.nunique()
# Print the number of distinct values and distinct values in each column
print("Column Name | Distinct Count ")
print("-" * 50) # Separator line
for column in data.columns:
distinct_values = data[column].unique()
print(f"{column:<30} | {distinct_counts[column]:<14}")
Column Name | Distinct Count -------------------------------------------------- SEQN | 37079 Gender | 2 Age | 66 Annual-Family-Income | 13 Ratio-Family-Income-Poverty | 501 X60-sec-pulse | 60 Systolic | 154 Diastolic | 114 Weight | 1432 Height | 627 Body-Mass-Index | 3235 White-Blood-Cells | 212 Lymphocyte | 610 Monocyte | 218 Eosinophils | 218 Basophils | 70 Red-Blood-Cells | 404 Hemoglobin | 131 Mean-Cell-Vol | 538 Mean-Cell-Hgb-Conc. | 247 Mean-cell-Hemoglobin | 4501 Platelet-count | 551 Mean-Platelet-Vol | 82 Segmented-Neutrophils | 677 Hematocrit | 329 Red-Cell-Distribution-Width | 156 Albumin | 39 ALP | 262 AST | 202 ALT | 224 Cholesterol | 543 Creatinine | 371 Glucose | 616 GGT | 387 Iron | 493 LDH | 312 Phosphorus | 101 Bilirubin | 77 Protein | 57 Uric.Acid | 122 Triglycerides | 798 Total-Cholesterol | 313 HDL | 276 Glycohemoglobin | 127 Vigorous-work | 3 Moderate-work | 3 Health-Insurance | 4 Diabetes | 3 Blood-Rel-Diabetes | 2 Blood-Rel-Stroke | 2 CoronaryHeartDisease | 2
data[data['Gender']==1].count()
SEQN 18047 Gender 18047 Age 18047 Annual-Family-Income 18047 Ratio-Family-Income-Poverty 18047 X60-sec-pulse 18047 Systolic 18047 Diastolic 18047 Weight 18047 Height 18047 Body-Mass-Index 18047 White-Blood-Cells 18047 Lymphocyte 18047 Monocyte 18047 Eosinophils 18047 Basophils 18047 Red-Blood-Cells 18047 Hemoglobin 18047 Mean-Cell-Vol 18047 Mean-Cell-Hgb-Conc. 18047 Mean-cell-Hemoglobin 18047 Platelet-count 18047 Mean-Platelet-Vol 18047 Segmented-Neutrophils 18047 Hematocrit 18047 Red-Cell-Distribution-Width 18047 Albumin 18047 ALP 18047 AST 18047 ALT 18047 Cholesterol 18047 Creatinine 18047 Glucose 18047 GGT 18047 Iron 18047 LDH 18047 Phosphorus 18047 Bilirubin 18047 Protein 18047 Uric.Acid 18047 Triglycerides 18047 Total-Cholesterol 18047 HDL 18047 Glycohemoglobin 18047 Vigorous-work 18047 Moderate-work 18047 Health-Insurance 18047 Diabetes 18047 Blood-Rel-Diabetes 18047 Blood-Rel-Stroke 18047 CoronaryHeartDisease 18047 dtype: int64
data[data['Gender']==2].count()
SEQN 19032 Gender 19032 Age 19032 Annual-Family-Income 19032 Ratio-Family-Income-Poverty 19032 X60-sec-pulse 19032 Systolic 19032 Diastolic 19032 Weight 19032 Height 19032 Body-Mass-Index 19032 White-Blood-Cells 19032 Lymphocyte 19032 Monocyte 19032 Eosinophils 19032 Basophils 19032 Red-Blood-Cells 19032 Hemoglobin 19032 Mean-Cell-Vol 19032 Mean-Cell-Hgb-Conc. 19032 Mean-cell-Hemoglobin 19032 Platelet-count 19032 Mean-Platelet-Vol 19032 Segmented-Neutrophils 19032 Hematocrit 19032 Red-Cell-Distribution-Width 19032 Albumin 19032 ALP 19032 AST 19032 ALT 19032 Cholesterol 19032 Creatinine 19032 Glucose 19032 GGT 19032 Iron 19032 LDH 19032 Phosphorus 19032 Bilirubin 19032 Protein 19032 Uric.Acid 19032 Triglycerides 19032 Total-Cholesterol 19032 HDL 19032 Glycohemoglobin 19032 Vigorous-work 19032 Moderate-work 19032 Health-Insurance 19032 Diabetes 19032 Blood-Rel-Diabetes 19032 Blood-Rel-Stroke 19032 CoronaryHeartDisease 19032 dtype: int64
data['Gender'].value_counts()
Gender 2 19032 1 18047 Name: count, dtype: int64
data['CoronaryHeartDisease'].value_counts()
CoronaryHeartDisease 0 35571 1 1508 Name: count, dtype: int64
Balancinng the data using SMOTE¶
# Assuming 'data' is your DataFrame
# X should contain your feature columns, and y should contain your target variable
# Separate features (X) and target variable (y)
X = data.drop('CoronaryHeartDisease', axis=1)
y = data['CoronaryHeartDisease']
# Define the pipeline with SMOTE and RandomUnderSampler
pipeline = Pipeline([
('under_sampler', RandomUnderSampler(sampling_strategy=0.10, random_state=42)), # 4:1 undersampling
('smote', SMOTE(sampling_strategy='auto', random_state=42)) # Auto-adjusted oversampling
])
# Fit and transform the data using the pipeline
X_resampled, y_resampled = pipeline.fit_resample(X, y)
# Create a new DataFrame with the resampled data
resampled_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='CoronaryHeartDisease')], axis=1)
resampled_data.head()
| SEQN | Gender | Age | Annual-Family-Income | Ratio-Family-Income-Poverty | X60-sec-pulse | Systolic | Diastolic | Weight | Height | ... | Total-Cholesterol | HDL | Glycohemoglobin | Vigorous-work | Moderate-work | Health-Insurance | Diabetes | Blood-Rel-Diabetes | Blood-Rel-Stroke | CoronaryHeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 61989 | 1 | 46 | 10 | 2.71 | 64 | 114 | 74 | 102.1 | 185.5 | ... | 5.84 | 1.71 | 5.3 | 1 | 1 | 1 | 2 | 2 | 2 | 0 |
| 1 | 47788 | 2 | 75 | 7 | 2.56 | 58 | 120 | 70 | 69.5 | 165.6 | ... | 3.62 | 1.32 | 5.1 | 2 | 2 | 1 | 2 | 1 | 2 | 0 |
| 2 | 68910 | 1 | 74 | 7 | 1.70 | 76 | 138 | 80 | 86.2 | 170.4 | ... | 4.03 | 0.83 | 5.9 | 2 | 2 | 1 | 2 | 2 | 2 | 0 |
| 3 | 24195 | 2 | 46 | 11 | 4.18 | 74 | 122 | 85 | 90.1 | 178.9 | ... | 6.75 | 1.34 | 5.1 | 2 | 2 | 1 | 2 | 2 | 1 | 0 |
| 4 | 52532 | 1 | 80 | 7 | 2.40 | 82 | 92 | 56 | 88.3 | 181.9 | ... | 3.39 | 1.71 | 6.4 | 2 | 2 | 1 | 1 | 1 | 2 | 0 |
5 rows × 51 columns
resampled_data.shape
(30160, 51)
resampled_data['CoronaryHeartDisease'].value_counts()
CoronaryHeartDisease 0 15080 1 15080 Name: count, dtype: int64
resampled_data.columns
Index(['SEQN', 'Gender', 'Age', 'Annual-Family-Income',
'Ratio-Family-Income-Poverty', 'X60-sec-pulse', 'Systolic', 'Diastolic',
'Weight', 'Height', 'Body-Mass-Index', 'White-Blood-Cells',
'Lymphocyte', 'Monocyte', 'Eosinophils', 'Basophils', 'Red-Blood-Cells',
'Hemoglobin', 'Mean-Cell-Vol', 'Mean-Cell-Hgb-Conc.',
'Mean-cell-Hemoglobin', 'Platelet-count', 'Mean-Platelet-Vol',
'Segmented-Neutrophils', 'Hematocrit', 'Red-Cell-Distribution-Width',
'Albumin', 'ALP', 'AST', 'ALT', 'Cholesterol', 'Creatinine', 'Glucose',
'GGT', 'Iron', 'LDH', 'Phosphorus', 'Bilirubin', 'Protein', 'Uric.Acid',
'Triglycerides', 'Total-Cholesterol', 'HDL', 'Glycohemoglobin',
'Vigorous-work', 'Moderate-work', 'Health-Insurance', 'Diabetes',
'Blood-Rel-Diabetes', 'Blood-Rel-Stroke', 'CoronaryHeartDisease'],
dtype='object')
Feature Selection¶
# Assuming your dataset has features (X) and labels (y)
# Separate the features and the target variable
X = resampled_data.drop(columns=['CoronaryHeartDisease', 'SEQN','Health-Insurance','Cholesterol']) # Features
y = resampled_data['CoronaryHeartDisease'] # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 1: Apply SelectKBest
k_best_selector = SelectKBest(chi2, k=30) # Adjust k as needed
X_train_k_best = k_best_selector.fit_transform(X_train, y_train)
selected_features = X.columns[k_best_selector.get_support()]
# Step 2: Apply RandomForestClassifier for further feature importance
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_k_best, y_train)
# Display feature importance scores
feature_importances = pd.DataFrame(rf_classifier.feature_importances_,
index=selected_features,
columns=['importance']).sort_values('importance', ascending=False)
print("Selected Features:")
print(selected_features)
print("\nFeature Importance Scores:")
print(feature_importances)
Selected Features:
Index(['Gender', 'Age', 'Annual-Family-Income', 'X60-sec-pulse', 'Systolic',
'Diastolic', 'Weight', 'Lymphocyte', 'Monocyte', 'Eosinophils',
'Mean-Cell-Vol', 'Platelet-count', 'Segmented-Neutrophils', 'Albumin',
'ALP', 'ALT', 'Creatinine', 'Glucose', 'GGT', 'Iron', 'LDH',
'Bilirubin', 'Uric.Acid', 'Triglycerides', 'Total-Cholesterol',
'Glycohemoglobin', 'Moderate-work', 'Diabetes', 'Blood-Rel-Diabetes',
'Blood-Rel-Stroke'],
dtype='object')
Feature Importance Scores:
importance
Age 0.205972
Gender 0.094800
Diabetes 0.066087
Creatinine 0.048159
Glycohemoglobin 0.045138
Glucose 0.033609
Total-Cholesterol 0.030954
Blood-Rel-Stroke 0.029520
Diastolic 0.027677
Triglycerides 0.026348
Albumin 0.025268
X60-sec-pulse 0.023563
Uric.Acid 0.023294
Platelet-count 0.022489
LDH 0.021782
Bilirubin 0.021246
Lymphocyte 0.021055
Blood-Rel-Diabetes 0.020636
Weight 0.020018
Systolic 0.019860
Eosinophils 0.019822
Mean-Cell-Vol 0.019739
Monocyte 0.019480
Annual-Family-Income 0.018784
Iron 0.018568
Segmented-Neutrophils 0.017016
ALT 0.016564
ALP 0.016124
GGT 0.016045
Moderate-work 0.010383
# Visualize feature importance with a red line indicating the threshold
threshold = 0.0225 # Adjust the threshold as needed
plt.figure(figsize=(20, 10))
ax = plt.barh(feature_importances.index, feature_importances['importance'], color='skyblue')
plt.axvline(x=threshold, color='red', linestyle='--', label=f'Threshold ({threshold})')
plt.xlabel('Importance Score')
plt.title('Feature Importance Scores')
plt.legend()
plt.show()
# Print features above the threshold line
selected_features_above_threshold = feature_importances[feature_importances['importance'] > threshold].index
print("\nFeatures Above the Threshold:")
print(selected_features_above_threshold)
print(len(selected_features_above_threshold))
Features Above the Threshold:
Index(['Age', 'Gender', 'Diabetes', 'Creatinine', 'Glycohemoglobin', 'Glucose',
'Total-Cholesterol', 'Blood-Rel-Stroke', 'Diastolic', 'Triglycerides',
'Albumin', 'X60-sec-pulse', 'Uric.Acid'],
dtype='object')
13
features=list(selected_features_above_threshold)
print(features)
features.append('CoronaryHeartDisease')
final_data=data[features]
final_data.head()
['Age', 'Gender', 'Diabetes', 'Creatinine', 'Glycohemoglobin', 'Glucose', 'Total-Cholesterol', 'Blood-Rel-Stroke', 'Diastolic', 'Triglycerides', 'Albumin', 'X60-sec-pulse', 'Uric.Acid']
| Age | Gender | Diabetes | Creatinine | Glycohemoglobin | Glucose | Total-Cholesterol | Blood-Rel-Stroke | Diastolic | Triglycerides | Albumin | X60-sec-pulse | Uric.Acid | CoronaryHeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 77 | 1 | 2 | 61.9 | 4.7 | 4.330 | 5.56 | 2 | 56 | 1.298 | 45 | 68 | 362.8 | 0 |
| 1 | 49 | 1 | 2 | 70.7 | 5.5 | 5.273 | 7.21 | 2 | 83 | 3.850 | 45 | 66 | 404.5 | 0 |
| 2 | 37 | 1 | 2 | 88.4 | 5.2 | 4.163 | 4.03 | 1 | 99 | 1.581 | 47 | 64 | 339.0 | 0 |
| 3 | 70 | 1 | 1 | 61.9 | 7.6 | 7.882 | 8.12 | 2 | 66 | 3.635 | 40 | 102 | 410.4 | 0 |
| 4 | 81 | 1 | 2 | 88.4 | 5.8 | 6.384 | 4.50 | 2 | 61 | 0.756 | 45 | 72 | 368.8 | 0 |
Data Analysis¶
sns.histplot(data=data, x='Age',hue='CoronaryHeartDisease',kde=True)
<Axes: xlabel='Age', ylabel='Count'>
# Filter the DataFrame for each gender
male_data = data[data['Gender'] == 1]
female_data = data[data['Gender'] == 2]
# Create separate figures for each gender's pie chart
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Plot pie chart for Male data
male_counts = male_data['CoronaryHeartDisease'].value_counts()
axes[0].pie(male_counts, labels=male_counts.index, autopct='%1.1f%%', startangle=90)
axes[0].set_title('Male - CoronaryHeartDisease Distribution')
# Plot pie chart for Female data
female_counts = female_data['CoronaryHeartDisease'].value_counts()
axes[1].pie(female_counts, labels=female_counts.index, autopct='%1.1f%%', startangle=90)
axes[1].set_title('Female - CoronaryHeartDisease Distribution')
plt.tight_layout()
plt.show()
# Selecting the features for the correlation matrix
features = ['Age', 'Gender', 'Diabetes', 'Creatinine', 'Glycohemoglobin', 'Glucose',
'Total-Cholesterol', 'Blood-Rel-Stroke', 'Diastolic', 'Triglycerides',
'Albumin', 'X60-sec-pulse', 'Uric.Acid', 'CoronaryHeartDisease']
# Create a correlation matrix
corr_matrix = final_data.corr()
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(12, 10))
# Create a heatmap with annotations
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
# Show the plot
plt.show()
# Selecting the features for which you want to plot the dendrogram
features = ['Age', 'Gender', 'Diabetes', 'Creatinine', 'Glycohemoglobin', 'Glucose',
'Total-Cholesterol', 'Blood-Rel-Stroke', 'Diastolic', 'Triglycerides',
'Albumin', 'X60-sec-pulse', 'Uric.Acid']
# Create a correlation matrix
corr_matrix = final_data.corr()
# Create a dendrogram
dendrogram = sch.dendrogram(sch.linkage(corr_matrix, method='ward'), labels=corr_matrix.columns, orientation='right')
# Add title and show the plot
plt.title('Dendrogram')
plt.show()
# Normalize the 'Triglycerides' values for bubble sizes
normalized_triglycerides = (final_data['Triglycerides'] - final_data['Triglycerides'].min()) / (final_data['Triglycerides'].max() - final_data['Triglycerides'].min())
# Create an interactive scatter plot using Plotly Express
fig = px.scatter(final_data, x='Age', y='Uric.Acid', size=normalized_triglycerides, color='CoronaryHeartDisease')
# Update layout for better interactivity
fig.update_layout(
title='Interactive Scatter Plot',
xaxis_title='Age',
yaxis_title='Uric Acid',
legend_title='Coronary Heart Disease',
width=900, # Set the width of the plot
height=600, # Set the height of the plot
template='plotly_dark', # Set the template to 'ggplot2'
)
# Show the interactive plot
fig.show()
# Create an interactive strip plot with Gender
fig = px.strip(final_data, color='CoronaryHeartDisease', x='Creatinine', y='Uric.Acid',
#category_orders={'Gender': [1, 2]}, # Specify the order of categories
title='Interactive Categorical Scatter Plot',
labels={'Total-Cholesterol': 'Total-Cholesterol', 'CoronaryHeartDisease': 'Heart Disease'},
width=800, height=500)
# Show the plot
fig.show()
# Line Plot
sns.lineplot(x='Age', y='Total-Cholesterol', hue='CoronaryHeartDisease', data=final_data)
plt.show()
Hypothesis Testing¶
Hypothesis 1 (T-Test):
Null Hypothesis (H0): The mean age of individuals with CoronaryHeartDisease is equal to the mean age of individuals without CoronaryHeartDisease.
Alternative Hypothesis (H1): The mean age of individuals with CoronaryHeartDisease is different from the mean age of individuals without CoronaryHeartDisease.
Test: Independent samples t-test comparing the age distribution between the two groups. Hypothesis 2 (Prop Test):
# Hypothesis 1: T-Test for Age
alpha = 0.05
age_with_disease = data.loc[data['CoronaryHeartDisease'] == 1, 'Age']
age_without_disease = data.loc[data['CoronaryHeartDisease'] == 0, 'Age']
t_stat_age, p_value_age = stats.ttest_ind(age_with_disease, age_without_disease)
if p_value_age < alpha:
print("Hypothesis 1 Result: Reject the null hypothesis. The mean age is significantly different between individuals with and without CoronaryHeartDisease.")
else:
print("Hypothesis 1 Result: Fail to reject the null hypothesis. There is no significant difference in mean age between individuals with and without CoronaryHeartDisease.")
Hypothesis 1 Result: Reject the null hypothesis. The mean age is significantly different between individuals with and without CoronaryHeartDisease.
mean_age_by_chd = final_data.groupby('CoronaryHeartDisease')['Age'].mean()
# Plotting the bar plot
plt.figure(figsize=(8, 6))
mean_age_by_chd.plot(kind='bar', color='skyblue')
plt.title('Mean Age of Individuals with and without Coronary Heart Disease')
plt.xlabel('Coronary Heart Disease')
plt.ylabel('Mean Age')
plt.xticks(ticks=[0, 1], labels=['Without CHD', 'With CHD'])
plt.grid(axis='y')
plt.show()
Null Hypothesis (H0): The mean Glycohemoglobin level is equal in individuals with and without CoronaryHeartDisease.
Alternative Hypothesis (H1): The mean Glycohemoglobin level is different in individuals with and without CoronaryHeartDisease.
Test: Independent samples t-test comparing Glycohemoglobin levels between the two groups.
# Hypothesis 2: T-Test for Glycohemoglobin
glycohemoglobin_with_disease = data.loc[data['CoronaryHeartDisease'] == 1, 'Glycohemoglobin']
glycohemoglobin_without_disease = data.loc[data['CoronaryHeartDisease'] == 0, 'Glycohemoglobin']
t_stat_glycohemoglobin, p_value_glycohemoglobin = stats.ttest_ind(glycohemoglobin_with_disease, glycohemoglobin_without_disease)
if p_value_glycohemoglobin < alpha:
print("Hypothesis 4 Result: Reject the null hypothesis. The mean Glycohemoglobin level is significantly different between individuals with and without CoronaryHeartDisease.")
else:
print("Hypothesis 4 Result: Fail to reject the null hypothesis. There is no significant difference in mean Glycohemoglobin level between individuals with and without CoronaryHeartDisease.")
Hypothesis 4 Result: Reject the null hypothesis. The mean Glycohemoglobin level is significantly different between individuals with and without CoronaryHeartDisease.
# Interactive Scatter Plot with Trend Line
fig = px.scatter(final_data, x='Glycohemoglobin', y='Glucose', color='CoronaryHeartDisease', trendline='ols')
fig.show()
Hypothesis test for Pulse rate and Blood rel stroke
H0: Having a history of Coronary Heart disease does not have any affect on the pulse rate of people.
H_A: Having a history of Coronary Heart disease has an affect on the pulse rate of people.
#Hypothesis 3: CHD and X60 pulse rate
import pandas as pd
from scipy.stats import mannwhitneyu
# Assuming 'df' is your DataFrame containing 'X60-sec-pulse' and 'Blood-Rel-Stroke' columns
# Replace 'X60-sec-pulse' and 'Blood-Rel-Stroke' with the actual column names in your DataFrame
# Separate pulse rates for individuals with and without a family history of stroke
pulse_with_stroke = final_data[final_data['Blood-Rel-Stroke'] == 1]['X60-sec-pulse']
pulse_without_stroke = final_data[final_data['Blood-Rel-Stroke'] == 2]['X60-sec-pulse']
# Perform Mann-Whitney U test
statistic, p_value = mannwhitneyu(pulse_with_stroke, pulse_without_stroke)
# Display test results
print(f"Mann-Whitney U Statistic: {statistic}")
print(f"P-value: {p_value}")
# Compare the p-value with the significance level (e.g., 0.05) to draw conclusions
if p_value<0.05:
print("The Null Hypothesis is rejected, Having a history of Coronary Heart disease has an affect on the pulse rate of people.")
else:
print("The Null Hypothesis is accepted, Having a history of Coronary Heart disease does not have an affect on the pulse rate of people.")
Mann-Whitney U Statistic: 115365137.5 P-value: 4.452551164503746e-06 The Null Hypothesis is rejected, Having a history of Coronary Heart disease has an affect on the pulse rate of people.
sns.lmplot(data=final_data,x='Age',y='X60-sec-pulse',hue='CoronaryHeartDisease', col='Blood-Rel-Stroke',height=5)
<seaborn.axisgrid.FacetGrid at 0x2452cb125d0>
Hypothesis about diabeties:
H0: The proportion of people suffering with CoronaryHeartDisease is same across all the 3 diabeties types.
H_A: The proportion of people suffering with CoronaryHeartDisease is higher with Diabeties type 2 patients than the rest.
# Hypothesis 4: Chi2 test for Diabeties
import pandas as pd
from scipy.stats import chi2_contingency
# Create a contingency table
contingency_table = pd.crosstab(final_data['Diabetes'], final_data['CoronaryHeartDisease'])
# Perform the Chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)
# Significance level (alpha)
alpha = 0.05
# Print the results
print(f'Chi-square Test for Independence between Diabetes Type and CoronaryHeartDisease:')
print(f'p-value = {p_value}')
# Check if the p-value is less than alpha to make a decision
if p_value < alpha:
print('Reject the null hypothesis. There is evidence to suggest that the proportion of people suffering from CoronaryHeartDisease is different across diabetes types.')
else:
print('Fail to reject the null hypothesis. There is not enough evidence to suggest a difference in the proportion of people suffering from CoronaryHeartDisease across diabetes types.')
Chi-square Test for Independence between Diabetes Type and CoronaryHeartDisease: p-value = 1.6655000663136123e-148 Reject the null hypothesis. There is evidence to suggest that the proportion of people suffering from CoronaryHeartDisease is different across diabetes types.
# Calculate proportions of CHD occurrences for each type of Diabetes
proportions = final_data.groupby('Diabetes')['CoronaryHeartDisease'].mean()
# Plotting the proportions
plt.figure(figsize=(8, 6))
proportions.plot(kind='bar', color='skyblue')
plt.title('Proportion of CHD across Diabetes Types')
plt.xlabel('Diabetes Types')
plt.ylabel('Proportion of CHD')
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.axhline(proportions[2], color='red', linestyle='--', label='Diabetes Type 2 Proportion')
plt.legend()
plt.show()
# Violin Plot of Glucose by Diabetes Status
plt.figure(figsize=(12, 8))
sns.violinplot(x='Diabetes',hue='CoronaryHeartDisease', y='Glucose', data=final_data,legend=True,split=True)
plt.title('Violin Plot of Glucose by Diabetes Status')
plt.show()
# This is formatted as code
Hypothesis about X60 pulse rate and age:
H0: The pulse rate varies the same for people with and without Coronary Heart Disease.
H_A: The pulse rate varies the rigorously for people with Coronary Heart Disease than people without any heart disease.
#Hypothesis 5:
from scipy.stats import ttest_ind
# Example data (replace with your actual dataset)
group_with_CHD = data[data['CoronaryHeartDisease'] == 1]['X60-sec-pulse']
group_without_CHD = data[data['CoronaryHeartDisease'] == 0]['X60-sec-pulse']
# Perform the t-test
stat, p_value = ttest_ind(group_with_CHD, group_without_CHD, equal_var=False)
# Significance level (alpha)
alpha = 0.05
# Print the results
print(f't-Test for X60 pulse rate and CoronaryHeartDisease:')
print(f'p-value = {p_value}')
# Check if the p-value is less than alpha to make a decision
if p_value < alpha:
print('Reject the null hypothesis. There is evidence to suggest that the pulse rate varies rigorously for people with CoronaryHeartDisease than people without any heart disease.')
else:
print('Fail to reject the null hypothesis. There is not enough evidence to suggest a difference in the variation of pulse rate between people with and without CoronaryHeartDisease.')
t-Test for X60 pulse rate and CoronaryHeartDisease: p-value = 2.155744320482476e-41 Reject the null hypothesis. There is evidence to suggest that the pulse rate varies rigorously for people with CoronaryHeartDisease than people without any heart disease.
# Line Plot
sns.lineplot(x='Age', y='X60-sec-pulse', hue='CoronaryHeartDisease', data=final_data)
plt.show()
resampled_data.columns
Index(['SEQN', 'Gender', 'Age', 'Annual-Family-Income',
'Ratio-Family-Income-Poverty', 'X60-sec-pulse', 'Systolic', 'Diastolic',
'Weight', 'Height', 'Body-Mass-Index', 'White-Blood-Cells',
'Lymphocyte', 'Monocyte', 'Eosinophils', 'Basophils', 'Red-Blood-Cells',
'Hemoglobin', 'Mean-Cell-Vol', 'Mean-Cell-Hgb-Conc.',
'Mean-cell-Hemoglobin', 'Platelet-count', 'Mean-Platelet-Vol',
'Segmented-Neutrophils', 'Hematocrit', 'Red-Cell-Distribution-Width',
'Albumin', 'ALP', 'AST', 'ALT', 'Cholesterol', 'Creatinine', 'Glucose',
'GGT', 'Iron', 'LDH', 'Phosphorus', 'Bilirubin', 'Protein', 'Uric.Acid',
'Triglycerides', 'Total-Cholesterol', 'HDL', 'Glycohemoglobin',
'Vigorous-work', 'Moderate-work', 'Health-Insurance', 'Diabetes',
'Blood-Rel-Diabetes', 'Blood-Rel-Stroke', 'CoronaryHeartDisease'],
dtype='object')
selected_features_above_threshold
Index(['Age', 'Gender', 'Diabetes', 'Creatinine', 'Glycohemoglobin', 'Glucose',
'Total-Cholesterol', 'Blood-Rel-Stroke', 'Diastolic', 'Triglycerides',
'Albumin', 'X60-sec-pulse', 'Uric.Acid'],
dtype='object')
features=list(selected_features_above_threshold)
print(features)
['Age', 'Gender', 'Diabetes', 'Creatinine', 'Glycohemoglobin', 'Glucose', 'Total-Cholesterol', 'Blood-Rel-Stroke', 'Diastolic', 'Triglycerides', 'Albumin', 'X60-sec-pulse', 'Uric.Acid']
features.append('CoronaryHeartDisease')
final_data=resampled_data[features]
final_data.head()
| Age | Gender | Diabetes | Creatinine | Glycohemoglobin | Glucose | Total-Cholesterol | Blood-Rel-Stroke | Diastolic | Triglycerides | Albumin | X60-sec-pulse | Uric.Acid | CoronaryHeartDisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 46 | 1 | 2 | 94.59 | 5.3 | 4.66 | 5.84 | 2 | 74 | 0.858 | 42 | 64 | 237.9 | 0 |
| 1 | 75 | 2 | 2 | 81.33 | 5.1 | 5.00 | 3.62 | 2 | 70 | 1.840 | 35 | 58 | 243.9 | 0 |
| 2 | 74 | 1 | 2 | 60.11 | 5.9 | 5.22 | 4.03 | 2 | 80 | 3.252 | 41 | 76 | 285.5 | 0 |
| 3 | 46 | 2 | 2 | 70.72 | 5.1 | 4.88 | 6.75 | 1 | 85 | 1.118 | 44 | 74 | 362.8 | 0 |
| 4 | 80 | 1 | 1 | 137.02 | 6.4 | 6.72 | 3.39 | 2 | 56 | 0.610 | 41 | 82 | 440.2 | 0 |
Model Training and Evaluation¶
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, f1_score
import matplotlib.pyplot as plt
# Assume X contains features and y contains the target variable
X = resampled_data.drop('CoronaryHeartDisease', axis=1)
y = resampled_data['CoronaryHeartDisease']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Initialize models
svm_model = SVC(kernel='linear', random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=5)
# Train and predict using each model
models = [svm_model, rf_model, lr_model, gb_model, knn_model]
model_names = ['SVM', 'Random Forest', 'Logistic Regression', 'Gradient Boosting', 'KNN']
accuracies = []
roc_auc_scores = []
f1_scores = []
auc_scores = []
plt.figure(figsize=(8, 6))
for model in models:
if model == knn_model:
model.fit(X_train_scaled, y_train)
predictions = model.predict(X_test_scaled)
else:
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
accuracies.append(accuracy)
f1 = f1_score(y_test, predictions)
f1_scores.append(f1)
auc = roc_auc_score(y_test, predictions)
auc_scores.append(auc)
fpr, tpr, _ = roc_curve(y_test, predictions)
plt.plot(fpr, tpr, label=model_names[models.index(model)])
# Labels and Legend
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Models')
plt.legend()
plt.grid(True)
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
# Line chart for accuracies
plt.figure(figsize=(10, 6))
plt.plot(model_names, accuracies, marker='o', linestyle='-', color='b')
plt.title('Model Accuracies')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.ylim(0, 1.0)
plt.grid(True)
plt.show()
for i, model_name in enumerate(model_names):
print(f'{model_name} - Accuracy: {accuracies[i]}')
SVM - Accuracy: 0.8474801061007957 Random Forest - Accuracy: 0.9121352785145889 Logistic Regression - Accuracy: 0.8234416445623343 Gradient Boosting - Accuracy: 0.875 KNN - Accuracy: 0.8851127320954907
# Line chart for F1 scores
plt.figure(figsize=(10, 6))
plt.plot(model_names, f1_scores, marker='o', linestyle='-', color='b')
plt.title('Model F1 Scores')
plt.xlabel('Models')
plt.ylabel('F1 Score')
plt.ylim(0, 1.0)
plt.grid(True)
plt.show()
for i, model_name in enumerate(model_names):
print(f'{model_name} - F1 score: {f1_scores[i]}')
SVM - F1 score: 0.8494764397905759 Random Forest - F1 score: 0.9133420536298233 Logistic Regression - F1 score: 0.8257241040746195 Gradient Boosting - F1 score: 0.8766361256544503 KNN - F1 score: 0.8911917098445596